*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: 'data_selection_functions.dta', 'data_counterfactual_analysis.dta'
* 			 		 
*	- Output: Figures 4 and S4; Table S7; 'robustness_counterfactual.dta'
*			 
* Topic: Counterfactual Analysis
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

clear all
clear matrix
cap log close
set more off
pause off

*-----*
* 0.1 *
*-----*

* Run the dofiles which defines all the globals for the folder's and sub-folders
* paths to both data and analysis.

if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path

}

*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

cap log off
log using "${run}${log}log_5.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*-------------------*
* Table of content: *
*-------------------*
* Sections: 
* 1. Set Options
* 2. Selection Subroutine 
* 3. Estimation Subroutine
* 4. Bootstrap Subroutine
* 5. ESTIMATION
* 6. Graphs
* 7. Robustness



*--------------------------*
* Section 1 -- Set Options *
*--------------------------*

local bandw "sjpi"
local rep=500
local path "${run}${table}tabS7.xls"
local covariates z_Ph2 D1 D2 sqrt_enrollment placebo p_adjust cy_* mc_*
set seed 17392
local SPLIT top10rev

*-----------------------------------*
* Section 2 -- Selection Subroutine *
*-----------------------------------*

capture program drop myselection
program define myselection,nclass
args indu split_cond covariates boots
preserve
use "${data}${final}data_selection_functions.dta",clear
keep if INDUSTRY==`indu'
keep if `split_cond'

if `boots'==1 {
bsample, cl(nct_id)
}
gen aux=1 if D0==1 & z_modifier!=0
count if aux==1
local N=r(N)
sort aux z
forvalues j=1(1)`N' {
if z_modifier[`j']==-1 {
qui sum z if z<=z[`j'] & z_modifier==0 ,d
qui replace z=r(mean) in `j'
}
if z_modifier[`j']==1 {
qui sum z if z>=z[`j'] & z_modifier==0,d
qui replace z=r(mean) in `j'
}
}

gen z_Ph2=D0*z
logit matched `covariates'
restore
end


*------------------------------------*
* Section 3 -- Estimation Subroutine *
*------------------------------------*

capture program drop myestimation
program define myestimation,rclass
args indu split_cond covariates fig
preserve
keep if INDUSTRY==`indu'
keep if `split_cond'
qui count
local Nt=r(N)


gen aux=1 if D0==1 & z_modifier!=0


forvalues a=2(1)3 {

gen here=1 if Ph`a'==1 

qui count if Ph`a'==1 & aux==1
local N=r(N)

sort here aux z

forvalues j=1(1)`N' {
if z_modifier[`j']==-1 {
qui sum z if z<=z[`j'] & z_modifier==0 & Ph`a'==1,d
qui replace z=r(mean) in `j'
}
if z_modifier[`j']==1 {
qui sum z if z>=z[`j'] & z_modifier==0 & Ph`a'==1,d
qui replace z=r(mean) in `j'
}
}

drop here
}


** predict continuation probabilities
gen w0=1
gen z_Ph2=D0*z if Ph2==1
predict w1 if Ph2==1, pr

** estimate kernel densities and construct counterfacuals **
qui sum w0 if D0==1 & Ph2==1
local swPh2_0=r(sum)
qui sum w0 if D1==1 & Ph2==1
local swPh2_1=r(sum)
qui sum w0 if D2==1 & Ph2==1
local swPh2_2=r(sum)

qui sum w0 if D0==1 & Ph3==1
local swPh3_0=r(sum)
qui sum w0 if D1==1 & Ph3==1
local swPh3_1=r(sum)
qui sum w0 if D2==1 & Ph3==1
local swPh3_2=r(sum)


gen grid=(_n-1)/50
replace grid=. if grid>6

forvalues j=2(1)3 {
kdens z [aw=w0] if Ph`j'==1 & D0==1,at(grid) gen(d1ph`j') bw(`bandw') k(e) ll(0) hist(100) nogr replace
}

local Ph2_0a=`swPh2_0'/(`swPh2_0'+`swPh2_1'+`swPh2_2')
local Ph2_1a=`swPh2_1'/(`swPh2_0'+`swPh2_1'+`swPh2_2')
local Ph2_2a=`swPh2_2'/(`swPh2_0'+`swPh2_1'+`swPh2_2')

local Ph3_0=`swPh3_0'/(`swPh3_0'+`swPh3_1'+`swPh3_2')
local Ph3_1=`swPh3_1'/(`swPh3_0'+`swPh3_1'+`swPh3_2')
local Ph3_2=`swPh3_2'/(`swPh3_0'+`swPh3_1'+`swPh3_2')


kdens z [aw=w1] if Ph2==1 & D0==1,at(grid) gen(d1_ad1) bw(`bandw') k(e) ll(0) hist(100) nogr replace

qui sum w1 if D0==1 & Ph2==1
local sw1_0=r(sum)
qui sum w1 if D1==1 & Ph2==1
local sw1_1=r(sum)
qui sum w1 if D2==1 & Ph2==1
local sw1_2=r(sum)

local Ph2_0b=`sw1_0'/(`sw1_0'+`sw1_1'+`sw1_2')
local Ph2_1b=`sw1_1'/(`sw1_0'+`sw1_1'+`sw1_2')
local Ph2_2b=`sw1_2'/(`sw1_0'+`sw1_1'+`sw1_2')

sort grid

integ d1ph2 grid if grid<1.96
local ph2_sig=(1-r(integral))*`Ph2_0a'+`Ph2_1a'+`Ph2_2a'

integ d1ph3 grid if grid<1.96
local ph3a_sig=(1-r(integral))*`Ph3_0'+`Ph3_1'+`Ph3_2'

integ d1_ad1 grid if grid<1.96
local ph3p1_sig=(1-r(integral))*`Ph2_0b'+`Ph2_1b'+`Ph2_2b'

return scalar ph2_sig=`ph2_sig'
return scalar ph3a_sig=`ph3a_sig'
return scalar ph3p1_sig=`ph3p1_sig'

return scalar dif_3a_2=`ph3a_sig'-`ph2_sig'
return scalar dif_3a_3p1=`ph3a_sig'-`ph3p1_sig'
return scalar dif_3p1_2=`ph3p1_sig'-`ph2_sig'

return scalar N=`Nt'

if "`fig'"!="NO" {

if "`fig'"=="A" {
  local spons "All industry"
}
if "`fig'"=="B" {
  local spons "Small industry"
}
if "`fig'"=="C" {
  local spons "Top 10 industry"
}


graph twoway (line d1ph2 grid, lcolor(ebblue) lwidth(thick) lpattern(solid)) (line d1ph3 grid, lcolor(gs6) lwidth(thick) lpattern(solid)) (line d1_ad1 grid, lcolor(mint) lwidth(thick) lpattern(longdash)) if grid<=4, ytitle(Density) xtitle(z) xline(1.96, lwidth(medthick) lpattern(dash) lcolor(black)) text(0.5 2.32 "z=1.96") xlabel( , format(%03.1f)) ylabel( , format(%03.1f)) title("{bf:`fig'}", pos(11) size(huge)) subtitle("`spons'",pos(12) size(large)) name(`fig',replace) graphregion(color(white)) bgcolor(white) legend(order(1 "[Ph2]" 3 "[Ph2+SC]" 2 "[Ph3]") c(1)) nodraw plotr(ls(none))
}

restore
end


*-----------------------------------*
* Section 4 -- Bootstrap Subroutine *
*-----------------------------------*

capture program drop myboot
program define myboot,rclass
args indu split_cond covariates
preserve
myselection `indu' `split_cond' "`covariates'" 1 
bsample , strata(Ph3) cl(nct_id)
myestimation `indu' `split_cond' "`covariates'" "NO"
return scalar ph2_sig=r(ph2_sig)
return scalar ph3a_sig=r(ph3a_sig)
return scalar ph3p1_sig=r(ph3p1_sig)

return scalar dif_3a_2=r(dif_3a_2)
return scalar dif_3a_3p1=r(dif_3a_3p1)
return scalar dif_3p1_2=r(dif_3p1_2)

restore
end

*-------------------------*
* Section 5 -- ESTIMATION *
*-------------------------*

*-----*
* 5.1 *
*-----*

* All Industry Sponsors

use "${data}${final}data_counterfactual_analysis.dta", clear

keep if Ph2==1 | Ph3==1
forvalues j=2(1)3 {
qui count if INDUSTRY==1 & Ph`j'==1
local obs`j'=r(N)
qui unique nct_id if INDUSTRY==1 & Ph`j'==1
local trial`j'=r(sum)
}
myselection 1 1 "`covariates'" 0
myestimation 1 1 "`covariates'" "A"
matrix parameters=(r(ph2_sig), r(ph3a_sig), r(ph3p1_sig), r(dif_3a_2), r(dif_3a_3p1), r(dif_3p1_2))
matrix param1=(r(ph2_sig), r(ph3p1_sig), r(ph3a_sig))
local N=r(N)
simulate ph2_sig=r(ph2_sig) ph3a_sig=r(ph3a_sig) ph3p1_sig=r(ph3p1_sig) dif_3a_2=r(dif_3a_2) dif_3a_3p1=r(dif_3a_3p1) dif_3p1_2=r(dif_3p1_2), reps(`rep') :myboot 1 1 "`covariates'"
bstat ph2_sig ph3a_sig ph3p1_sig dif_3a_2 dif_3a_3p1 dif_3p1_2, stat(parameters) n(`N')
outreg2 using "`path'", replace adds("Observations Ph2", `obs2', "Observations Ph3", `obs3', "No. of trials Ph2", `trial2', "No. of trials Ph3", `trial3')


*-----*
* 5.2 *
*-----*

* Small Industry Sponsors

use "${data}${final}data_counterfactual_analysis.dta", clear

keep if Ph2==1 | Ph3==1
forvalues j=2(1)3 {
qui count if INDUSTRY==1 & `SPLIT'==0 & Ph`j'==1
local obs`j'=r(N)
qui unique nct_id if INDUSTRY==1 & `SPLIT'==0 & Ph`j'==1
local trial`j'=r(sum)
}
myselection 1 "`SPLIT'==0" "`covariates'" 0
myestimation 1 "`SPLIT'==0" "`covariates'" "B"
matrix parameters=(r(ph2_sig), r(ph3a_sig), r(ph3p1_sig), r(dif_3a_2), r(dif_3a_3p1), r(dif_3p1_2))
matrix param10=(r(ph2_sig), r(ph3p1_sig), r(ph3a_sig))
local N=r(N)
simulate ph2_sig=r(ph2_sig) ph3a_sig=r(ph3a_sig) ph3p1_sig=r(ph3p1_sig) dif_3a_2=r(dif_3a_2) dif_3a_3p1=r(dif_3a_3p1) dif_3p1_2=r(dif_3p1_2), reps(`rep') :myboot 1 "`SPLIT'==0" "`covariates'"
bstat ph2_sig ph3a_sig ph3p1_sig dif_3a_2 dif_3a_3p1 dif_3p1_2, stat(parameters) n(`N')
outreg2 using "`path'", append adds("Observations Ph2", `obs2', "Observations Ph3", `obs3', "No. of trials Ph2", `trial2', "No. of trials Ph3", `trial3')


*-----*
* 5.3 *
*-----*

* Top 10 Industry Sponsors

use "${data}${final}data_counterfactual_analysis.dta", clear

keep if Ph2==1 | Ph3==1
forvalues j=2(1)3 {
qui count if INDUSTRY==1 & `SPLIT'==1 & Ph`j'==1
local obs`j'=r(N)
qui unique nct_id if INDUSTRY==1 & `SPLIT'==1 & Ph`j'==1
local trial`j'=r(sum)
}
myselection 1 "`SPLIT'==1" "`covariates'" 0
myestimation 1 "`SPLIT'==1" "`covariates'" "C"
matrix parameters=(r(ph2_sig), r(ph3a_sig), r(ph3p1_sig), r(dif_3a_2), r(dif_3a_3p1), r(dif_3p1_2))
matrix param11=(r(ph2_sig), r(ph3p1_sig), r(ph3a_sig))
local N=r(N)
simulate ph2_sig=r(ph2_sig) ph3a_sig=r(ph3a_sig) ph3p1_sig=r(ph3p1_sig) dif_3a_2=r(dif_3a_2) dif_3a_3p1=r(dif_3a_3p1) dif_3p1_2=r(dif_3p1_2), reps(`rep') :myboot 1 "`SPLIT'==1" "`covariates'"
bstat ph2_sig ph3a_sig ph3p1_sig dif_3a_2 dif_3a_3p1 dif_3p1_2, stat(parameters) n(`N')
outreg2 using "`path'", append adds("Observations Ph2", `obs2', "Observations Ph3", `obs3', "No. of trials Ph2", `trial2', "No. of trials Ph3", `trial3')

*---------------------*
* Section 6 -- Graphs *
*---------------------*

*-----*
* 6.1 *
*-----*

* Figure S4

grc1leg A B C, c(2) scheme(s1color) legendfrom(C) ring(0) position(4) ycom name(figS4, replace)
graph export "${run}${graph}figS4.png", replace width(1500)


*-----*
* 6.2 *
*-----*

* Figure 4A

clear
set obs 9
gen significant=.
gen sponsor=""
gen phase=""
replace sponsor="All industry" in 1/3
replace sponsor="Small industry" in 4/6
replace sponsor="Top 10 industry" in 7/9

replace phase="Phase 2" in 1
replace significant=param1[1,1] in 1
local all2=param1[1,1]
replace phase="Phase 2 + selective continuation" in 2
replace significant=param1[1,2] in 2
local allp=param1[1,2]
replace phase="Phase 3" in 3
replace significant=param1[1,3] in 3
local all3=param1[1,3]

replace phase="Phase 2" in 4
replace significant=param10[1,1] in 4
local small2=param10[1,1]
replace phase="Phase 2 + selective continuation" in 5
replace significant=param10[1,2] in 5
local smallp=param10[1,2]
replace phase="Phase 3" in 6
replace significant=param10[1,3] in 6
local small3=param10[1,3]

replace phase="Phase 2" in 7
replace significant=param11[1,1] in 7
local large2=param11[1,1]
replace phase="Phase 2 + selective continuation" in 8
replace significant=param11[1,2] in 8
local largep=param11[1,2]
replace phase="Phase 3" in 9
replace significant=param11[1,3] in 9
local large3=param11[1,3]


sort phase sponsor
by phase: gen i=_n
sort sponsor phase
by sponsor: gen j=_n
drop phase
reshape wide significant, i(i) j(j)
rename significant1 Ph2
gen cont=significant2-Ph2
gen res=significant3-significant2

gen trick1=.00000001
gen trick2=.00000001

local star1_all=(`allp'-`all2')/2+`all2'
local star2_all=(`all3'-`allp')/2+`allp'
local star1_small=(`smallp'-`small2')/2+`small2'
local star2_small=(`small3'-`smallp')/2+`smallp'
local star1_large=(`largep'-`large2')/2+`large2'
local star2_large=(`large3'-`largep')/2+`largep'

graph hbar Ph2 trick1 cont res trick2, over(sponsor) stack ytitle("Share significant") ylabel( , format(%03.1f)) bar(1,col(ebblue)) bar(2,col(black) lc(black) lw(thick)) bar(3,col(mint)) bar(4,col(gs13)) bar(5,col(black) lc(black) lw(medium)) text(`small2' 36 "Phase II") text(`large2' 1 "Phase II") text(`all2' 71 "Phase II") text(`small3' 36 "Phase III") text(`large3' 1 "Phase III") text(`all3' 71 "Phase III") text(`star2_all' 97 "***") text(`star1_all' 97 "***") text(`star2_small' 61 "***") text(`star1_small' 61 "**") text(`star1_large' 26 "***") legend(order(1 3 4) c(3) label(1 "Phase II") label(3 "Selective continuation") label(4 "Residual to phase III")) title("{bf:A}", pos(11) size(huge)) name(fig3A,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) ysize(8) xsize(20)
graph export "${run}${graph}fig4A.png", replace width(1500)


*-------------------------*
* Section 7 -- Robustness *
*-------------------------*

local criteria rev_rank rxsales_rank rdspend_rank rep_rank

tempname memhold
postfile `memhold' str20 criterion rank fracSC_small fracSC_large using "${data}${final}robustness_counterfactual.dta", replace

foreach criterion of local criteria {
forval c = 7 / 20 {

disp in red "`criterion'  `c'"
quietly {

* Small Industry Sponsors

use "${data}${final}data_counterfactual_analysis.dta", clear
keep if Ph2==1 | Ph3==1
myselection 1 "(`criterion'<=`c')==0" "`covariates'" 0
myestimation 1 "(`criterion'<=`c')==0" "`covariates'" "NO"
local fs=(r(dif_3p1_2)/r(dif_3a_2))

* Large Industry Sponsors

use "${data}${final}data_counterfactual_analysis.dta", clear
keep if Ph2==1 | Ph3==1
myselection 1 "(`criterion'<=`c')==1" "`covariates'" 0
myestimation 1 "(`criterion'<=`c')==1" "`covariates'" "NO"
local fl=r(dif_3p1_2)/r(dif_3a_2)

post `memhold' ("`criterion'") (`c') (`fs') (`fl')

}
}
}

postclose `memhold'


use "${data}${final}robustness_counterfactual.dta", clear
gen perSC_small = fracSC_small*100
gen perSC_large = fracSC_large*100


graph twoway (histogram perSC_small, frac width(2) start(0) fcol(mint) lcol(grey) lwidth(vvthin)) , ylabel(0(.05)0.2, format(%03.2f)) xlabel(0(20)105) ytitle("Fraction", size(large)) xtitle("% [Ph3]-[Ph2] explained by selective continuation", size(large)) title("{bf:B}", pos(11) size(huge)) subtitle("Small industry",pos(12) size(large)) name(robB,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) 
graph export "${run}${graph}fig4B.png", replace width(1500)
graph twoway (histogram perSC_large, frac width(2) start(0) fcol(mint) lcol(grey) lwidth(vvthin)) , ylabel(0(.05)0.2, format(%03.2f)) xlabel(0(20)105) ytitle("Fraction", size(large)) xtitle("% [Ph3]-[Ph2] explained by selective continuation", size(large)) title("{bf:C}", pos(11) size(huge)) subtitle("Large industry",pos(12) size(large)) name(robC,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none))
graph export "${run}${graph}fig4C.png", replace width(1500)


log close 
cd "${run}${dofiles}"
